/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.db;
import java.io.*;
import java.net.*;
import java.util.*;
import net.nutch.io.*;
import net.nutch.util.*;
import net.nutch.net.UrlNormalizer;
/*********************************************
* A row in the Page Database.
* <pre>
* type name description
* ---------------------------------------------------------------
* byte VERSION - A byte indicating the version of this entry.
* String URL - The url of a page. This is the primary key.
* 128bit ID - The MD5 hash of the contents of the page.
* 64bit DATE - The date this page should be refetched.
* byte RETRIES - The number of times we've failed to fetch this page.
* byte INTERVAL - Frequency, in days, this page should be refreshed.
* int NUMOUTLINKS - The number of outlinks of this page (added in version 3).
* float SCORE - Multiplied into the score for hits on this page (added in version 2).
* float NEXTSCORE - A second score value (added in version 4).
* </pre>
*
* @author Mike Cafarella
* @author Doug Cutting
*********************************************/
public class Page implements WritableComparable, Cloneable {
  /** Version byte written at the start of every serialized entry. */
  private final static byte CUR_VERSION = 4;

  /**
   * Refetch interval, in days, given to newly constructed pages. Read from
   * the "db.default.fetch.interval" configuration key (30 if unset) and
   * narrowed to a byte.
   */
  private static final byte DEFAULT_INTERVAL =
    (byte)NutchConf.getInt("db.default.fetch.interval", 30);

  private UTF8 url;
  private MD5Hash md5;
  private long nextFetch = System.currentTimeMillis();
  private byte retries;
  private byte fetchInterval = DEFAULT_INTERVAL;
  private int numOutlinks;
  private float score = 1.0f;
  private float nextScore = 1.0f;

  /** Construct a page ready to be read by {@link
   * #readFields(DataInput)}.*/
  public Page() {
    url = new UTF8();                             // initialize for readFields()
    md5 = new MD5Hash();                          // initialize for readFields()
  }

  /**
   * Construct a new page with the given URL and hash, due to be fetched now,
   * with default scores.
   *
   * @param urlString the page URL; it is normalized before being stored
   * @param md5 the hash to use as this page's ID; stored by reference
   * @throws MalformedURLException if the URL cannot be normalized
   */
  public Page(String urlString, MD5Hash md5) throws MalformedURLException {
    setURL(urlString);
    this.md5 = md5;
  }

  /**
   * Construct a new page with explicit scores and next-fetch time. The ID is
   * the MD5 hash of the normalized URL.
   *
   * @param urlString the page URL; it is normalized before being stored
   * @param score the page's score
   * @param nextScore the page's second score
   * @param nextFetch the time, in milliseconds, the page is next due
   * @throws MalformedURLException if the URL cannot be normalized
   */
  public Page(String urlString, float score, float nextScore, long nextFetch)
    throws MalformedURLException {
    this(urlString, score, nextScore);
    this.nextFetch = nextFetch;
  }

  /**
   * Construct a new page with explicit scores, due to be fetched now. The ID
   * is the MD5 hash of the normalized URL.
   *
   * @param urlString the page URL; it is normalized before being stored
   * @param score the page's score
   * @param nextScore the page's second score
   * @throws MalformedURLException if the URL cannot be normalized
   */
  public Page(String urlString, float score, float nextScore)
    throws MalformedURLException {
    setURL(urlString);
    this.md5 = MD5Hash.digest(url);               // hash url, by default
    this.score = score;
    this.nextScore = nextScore;
  }

  /**
   * Read this page's fields from a stream, overwriting the current values.
   * The url and md5 objects are refilled in place. Entries written by older
   * versions are accepted; fields they lack take their defaults.
   *
   * @throws VersionMismatchException if the entry's version is newer than
   *         this class understands
   */
  public void readFields(DataInput in) throws IOException {
    byte version = in.readByte();                 // read version
    if (version > CUR_VERSION)                    // check version
      throw new VersionMismatchException(CUR_VERSION, version);

    url.readFields(in);
    md5.readFields(in);
    nextFetch = in.readLong();
    retries = in.readByte();
    fetchInterval = in.readByte();
    numOutlinks = (version > 2) ? in.readInt() : 0;      // added in Version 3
    score = (version > 1) ? in.readFloat() : 1.0f;       // added in Version 2
    nextScore = (version > 3) ? in.readFloat() : 1.0f;   // added in Version 4
  }

  /** Copy the contents of another instance into this instance. */
  public void set(Page that) {
    // url and md5 are copied by value into this instance's own objects.
    this.url.set(that.url);
    this.md5.set(that.md5);
    this.nextFetch = that.nextFetch;
    this.retries = that.retries;
    this.fetchInterval = that.fetchInterval;
    this.numOutlinks = that.numOutlinks;
    this.score = that.score;
    this.nextScore = that.nextScore;
  }

  /**
   * Write this page to the stream in the current-version layout. The field
   * order here must match {@link #readFields(DataInput)}.
   */
  public void write(DataOutput out) throws IOException {
    out.writeByte(CUR_VERSION);                   // store current version
    url.write(out);
    md5.write(out);
    out.writeLong(nextFetch);
    out.writeByte(retries);                       // one byte, read by readByte()
    out.writeByte(fetchInterval);                 // one byte, read by readByte()
    out.writeInt(numOutlinks);
    out.writeFloat(score);
    out.writeFloat(nextScore);
  }

  /**
   * Compare to another Page object: by MD5 first, then by URL.
   *
   * @throws ClassCastException if the argument is not a Page
   */
  public int compareTo(Object o) {
    Page that = (Page)o;
    int md5Result = this.md5.compareTo(that.md5);
    if (md5Result != 0) {
      return md5Result;
    }
    return this.url.compareTo(that.url);
  }

  /** Compares pages by MD5, then by URL. */
  public static class Comparator extends WritableComparator {
    public Comparator() { super(Page.class); }

    /**
     * Optimized comparator that works directly on serialized entries. It
     * assumes each entry is laid out as: one version byte, a two-byte
     * unsigned URL length, the URL bytes, then the MD5 bytes.
     */
    public int compare(byte[] b1, int s1, int l1,
                       byte[] b2, int s2, int l2) {
      int urlLen1 = readUnsignedShort(b1, s1+1);  // skip version byte
      int urlLen2 = readUnsignedShort(b2, s2+1);
      int urlStart1 = s1+1+2;                     // skip version and length
      int urlStart2 = s2+1+2;
      int md5Start1 = urlStart1 + urlLen1;        // md5 follows the url bytes
      int md5Start2 = urlStart2 + urlLen2;

      int c = compareBytes(b1, md5Start1, MD5Hash.MD5_LEN,   // compare md5
                           b2, md5Start2, MD5Hash.MD5_LEN);
      if (c != 0)
        return c;

      return compareBytes(b1, urlStart1, urlLen1, b2, urlStart2, urlLen2);
    }
  }

  /** Compares pages by URL only. */
  public static class UrlComparator extends WritableComparator {
    public UrlComparator() { super(Page.class); }

    /** Compares two deserialized pages by URL. */
    public int compare(WritableComparable a, WritableComparable b) {
      Page pageA = (Page)a;
      Page pageB = (Page)b;
      return pageA.getURL().compareTo(pageB.getURL());
    }

    /**
     * Optimized comparator that works directly on serialized entries. It
     * assumes each entry starts with one version byte followed by a two-byte
     * unsigned URL length and the URL bytes.
     */
    public int compare(byte[] b1, int s1, int l1,
                       byte[] b2, int s2, int l2) {
      int urlLen1 = readUnsignedShort(b1, s1+1);  // skip version byte
      int urlLen2 = readUnsignedShort(b2, s2+1);
      int urlStart1 = s1+1+2;                     // skip version and length
      int urlStart2 = s2+1+2;
      return compareBytes(b1, urlStart1, urlLen1, b2, urlStart2, urlLen2);
    }
  }

  /** Create a new Page and fill it from the stream. */
  public static Page read(DataInput in) throws IOException {
    Page page = new Page();
    page.readFields(in);
    return page;
  }

  //
  // Accessor methods
  //

  /** Returns the internal URL object (not a copy). */
  public UTF8 getURL() { return url; }

  /**
   * Replace the URL with the normalized form of the given string.
   *
   * @throws MalformedURLException if the URL cannot be normalized
   */
  public void setURL(String url) throws MalformedURLException {
    this.url = new UTF8(UrlNormalizer.normalize(url));
  }

  /** Returns the internal MD5 object (not a copy). */
  public MD5Hash getMD5() { return md5; }

  /** Replace the MD5; the argument is stored by reference. */
  public void setMD5(MD5Hash md5) { this.md5 = md5; }

  public long getNextFetchTime() { return nextFetch; }
  public void setNextFetchTime(long nextFetch) { this.nextFetch = nextFetch; }

  public byte getRetriesSinceFetch() { return retries; }

  /** Set the retry count; the value is narrowed to a byte. */
  public void setRetriesSinceFetch(int retries) {
    this.retries = (byte)retries;
  }

  public byte getFetchInterval() { return fetchInterval; }
  public void setFetchInterval(byte fetchInterval) {
    this.fetchInterval = fetchInterval;
  }

  public int getNumOutlinks() { return numOutlinks; }
  public void setNumOutlinks(int numOutlinks) {
    this.numOutlinks = numOutlinks;
  }

  public float getScore() { return score; }
  public float getNextScore() { return nextScore; }

  /** Set both scores at once. */
  public void setScore(float score, float nextScore) {
    this.score = score;
    this.nextScore = nextScore;
  }

  /**
   * Compute domain ID from URL: half of the MD5 digest of the URL's host.
   *
   * @throws MalformedURLException if the stored URL cannot be parsed
   */
  public long computeDomainID() throws MalformedURLException {
    String host = new URL(url.toString()).getHost();
    return MD5Hash.digest(host).halfDigest();
  }

  /**
   * Print out the Page, one labelled field per line.
   */
  public String toString() {
    StringBuffer buf = new StringBuffer();
    buf.append("Version: ").append(CUR_VERSION).append("\n");
    buf.append("URL: ").append(getURL()).append("\n");
    buf.append("ID: ").append(getMD5()).append("\n");
    buf.append("Next fetch: ").append(new Date(getNextFetchTime()))
       .append("\n");
    buf.append("Retries since fetch: ").append(getRetriesSinceFetch())
       .append("\n");
    buf.append("Retry interval: ").append(getFetchInterval())
       .append(" days\n");
    buf.append("Num outlinks: ").append(getNumOutlinks()).append("\n");
    buf.append("Score: ").append(getScore()).append("\n");
    buf.append("NextScore: ").append(getNextScore()).append("\n");
    return buf.toString();
  }

  /**
   * A tab-delimited text version of the Page's data. Every field, including
   * the last, is followed by a tab.
   */
  public String toTabbedString() {
    StringBuffer buf = new StringBuffer();
    buf.append(CUR_VERSION).append("\t");
    buf.append(getURL()).append("\t");
    buf.append(getMD5()).append("\t");
    buf.append(getNextFetchTime()).append("\t");
    buf.append(getRetriesSinceFetch()).append("\t");
    buf.append(getFetchInterval()).append("\t");
    buf.append(getNumOutlinks()).append("\t");
    buf.append(getScore()).append("\t");
    buf.append(getNextScore()).append("\t");
    return buf.toString();
  }

  /**
   * Two pages are equal when their url, md5, next-fetch time, retries, fetch
   * interval and both scores match.
   *
   * Scores are compared by bit pattern, the same representation hashCode()
   * uses, so equal pages always hash alike and a page with a NaN score still
   * equals itself.
   */
  public boolean equals(Object o) {
    if (this == o)
      return true;
    if (!(o instanceof Page))
      return false;
    Page other = (Page)o;
    // NOTE(review): numOutlinks is not compared here or in hashCode();
    // confirm whether that is intentional before adding it.
    return
      this.url.equals(other.url) &&
      this.md5.equals(other.md5) &&
      (this.nextFetch == other.nextFetch) &&
      (this.retries == other.retries) &&
      (this.fetchInterval == other.fetchInterval) &&
      (Float.floatToIntBits(this.score)
       == Float.floatToIntBits(other.score)) &&
      (Float.floatToIntBits(this.nextScore)
       == Float.floatToIntBits(other.nextScore));
  }

  /** Hash over the same fields that equals() compares. */
  public int hashCode() {
    return
      url.hashCode() ^
      md5.hashCode() ^
      ((int)nextFetch) ^
      retries ^
      fetchInterval ^
      Float.floatToIntBits(score) ^
      Float.floatToIntBits(nextScore);
  }

  /**
   * Return an independent copy of this page.
   *
   * The url and md5 objects are duplicated rather than shared, because
   * set(Page) and readFields(DataInput) overwrite them in place; a shared
   * reference would let later changes to one page alter the other.
   */
  public Object clone() {
    try {
      Page copy = (Page)super.clone();
      if (this.url != null) {
        copy.url = new UTF8();
        copy.url.set(this.url);
      }
      if (this.md5 != null) {                     // may be null via setMD5()
        copy.md5 = new MD5Hash();
        copy.md5.set(this.md5);
      }
      return copy;
    } catch (CloneNotSupportedException e) {
      throw new RuntimeException(e);
    }
  }
}